Datierung¶
Das Notebook ergänzt den Anhang 'Methoden' und widmet sich der automatischen Datierung von Gedichten.
Import¶
In [1]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, ARDRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
import itertools
Initial Setup¶
recreate meta¶
In [2]:
def read_metadata (path):
metadata = pd.read_csv(path, sep=';', low_memory=False)
new_header = metadata.iloc[0]
metadata = metadata[2:]
metadata.columns = new_header
return metadata
def get_anthologies (data, RemoveVolume = False):
anthologies = data.iloc[:,pos_id].values.tolist() # get ids
# delete .001 etc. (point - 3 digits - any non-word characters - end of word)
anthologies = [re.sub("\\.[0-9][0-9][0-9][^A-Za-z]*$", "", x) for x in anthologies]
if RemoveVolume :
anthologies = [re.sub("[0-9]$", "", x) for x in anthologies]
anthologies = [re.sub("\\.$", "", x) for x in anthologies]
return anthologies
def get_anthologies_years (data, always_get_year_of_first_ed = True):
anthologies = get_anthologies(data)
anthologies_years = []
anthologies_years_first_ed = [re.findall("[0-9]{4}", x) for x in anthologies]
anthologies_years_later_ed = [re.findall("\\([0-9]{4}\\)", x) for x in anthologies]
for i in range(len(anthologies)):
if always_get_year_of_first_ed or anthologies_years_later_ed[i] == []:
anthologies_years.append(anthologies_years_first_ed[i][0])
else:
anthologies_years.append(re.findall("[0-9]{4}", anthologies_years_later_ed[i][0])[0])
anthologies_years = [int(x) for x in anthologies_years]
return anthologies_years
In [3]:
meta = pd.DataFrame()
pos_id = 0
pos_authors_names = 3
pos_lifetimes_birth = 6
pos_lifetimes_death = 7
pos_title_unified = 9
pos_text_written = 10
pos_text_published = 11
annotations = read_metadata("../resources/more/annotations.csv")
meta['id'] = annotations.iloc[:,pos_id].tolist()
meta['anthology'] = get_anthologies(annotations, RemoveVolume = True)
meta['anthology_with_volume'] = get_anthologies(annotations, RemoveVolume = False)
meta['anthology_year_first_ed'] = get_anthologies_years(annotations, always_get_year_of_first_ed = True)
meta['anthology_year_used_ed'] = get_anthologies_years(annotations, always_get_year_of_first_ed = False)
meta['corpus'] = ['add' if x == '1920.Pinthus' or x == '2022.GeschAddMod' else 'anth' for x in meta['anthology']]
meta['author'] = annotations.iloc[:,pos_authors_names].tolist()
author_birth = []
for x in annotations.iloc[:,pos_lifetimes_birth].tolist():
try:
author_birth.append(int(x[0:4]))
except:
author_birth.append(float('NaN'))
meta['author_birth'] = author_birth
author_death = []
for x in annotations.iloc[:,pos_lifetimes_death].tolist():
try:
author_death.append(int(x[0:4]))
except:
author_death.append(float('NaN'))
meta['author_death'] = author_death
meta['title'] = annotations.iloc[:,pos_title_unified].tolist()
meta['author_title'] = meta['author'] + ' – ' + meta['title']
get basic date data¶
In [4]:
def get_year_search_status (meta):
written = meta.iloc[:,pos_text_written].tolist()
published = meta.iloc[:,pos_text_published].tolist()
results = []
for this_written, this_published in zip(written, published):
if any(char.isdigit() for char in str(this_written)) or any(char.isdigit() for char in str(this_published)):
results.append('searched_and_found')
elif '/' in str(this_written) or '/' in str(this_published):
results.append('searched_but_not_found')
else:
results.append('not_searched')
return results
def get_written_and_published (meta, get_verified_only = False):
written = meta.iloc[:,pos_text_written].tolist()
published = meta.iloc[:,pos_text_published].tolist()
if get_verified_only:
written = [x if str(x) != 'nan' and 'verified' in str(x) else float('NaN') for x in written]
published = [x if str(x) != 'nan' and 'verified' in str(x) else float('NaN') for x in published]
# convert to int
written_int = []
for x in written:
try:
x_clean = re.sub('\\.(.*)', '', re.sub('\\ (.*)', '', str(x)))
written_int.append(int(x_clean))
except:
written_int.append(float('NaN'))
published_int = []
for x in published:
x_clean = re.sub('\\.(.*)', '', re.sub('\\ (.*)', '', str(x)))
try:
published_int.append(int(x_clean))
except:
published_int.append(float('NaN'))
return [written_int, published_int]
def get_years_gt (written, published):
years_gt = []
for i in range(len(written)):
if str(written[i]) != 'nan': years_gt.append(written[i])
elif str(published[i]) != 'nan': years_gt.append(published[i])
else: years_gt.append(float('NaN'))
return years_gt
In [5]:
meta['year_search_status'] = get_year_search_status(annotations)
meta['written_gt'] = get_written_and_published(annotations, get_verified_only = False)[0]
meta['published_gt'] = get_written_and_published(annotations, get_verified_only = False)[1]
meta['year_gt'] = get_years_gt(meta.written_gt.tolist(), meta.published_gt.tolist())
In [6]:
print(f"Manuell recherchierte Jahre : {meta['year_gt'].dropna().shape[0]}")
Manuell recherchierte Jahre : 3507
prepare train/test data¶
In [7]:
data = (
meta
.sort_values(by='year_gt', na_position='last')
.drop_duplicates(subset="author_title")
.sort_values(by='author_title')
.reset_index(drop=True)
.copy()
)
data = {
'author_title' : data['author_title'].tolist(),
'author_birth_year': data['author_birth'].tolist(),
'author_death_year': data['author_death'].tolist(),
# 'author_lifespan': (data['author_death']-data['author_birth']).tolist(),
'first_anth_year': meta.groupby('author_title')['anthology_year_used_ed'].min().tolist(),
'mean_anth_year': meta.groupby('author_title')['anthology_year_used_ed'].mean().tolist(),
'last_anth_year': meta.groupby('author_title')['anthology_year_used_ed'].max().tolist(),
'text_count' : meta.groupby('author_title').size().tolist(),
'year_gt' : data['year_gt'].tolist()
}
data = pd.DataFrame(data)
data = data.sample(frac=1, random_state=0).reset_index(drop=True)
In [8]:
print(data.shape[0])
data.head()
10446
Out[8]:
| author_title | author_birth_year | author_death_year | first_anth_year | mean_anth_year | last_anth_year | text_count | year_gt | |
|---|---|---|---|---|---|---|---|---|
| 0 | Maltiz, Friedrich Franz Apollonius – Schicksal... | 1794.0 | 1857.0 | 1840 | 1869.900000 | 1909 | 10 | NaN |
| 1 | Keiter, Therese – Die Königin | 1859.0 | 1925.0 | 2022 | 2022.000000 | 2022 | 1 | NaN |
| 2 | Kolmar, Gertrud – Marats Antlitz | 1894.0 | 1943.0 | 2022 | 2022.000000 | 2022 | 1 | 1934.0 |
| 3 | Wildenbruch, Ernst von – Dem Fürsten Bismarck | 1845.0 | 1909.0 | 1903 | 1930.166667 | 1981 | 6 | 1890.0 |
| 4 | Schollmeyer, Johann Georg – Fürsten-Größe im S... | 1768.0 | 1839.0 | 1827 | 1827.000000 | 1827 | 1 | NaN |
In [9]:
possible_features = [
'author_birth_year',
'author_death_year',
# 'author_lifespan',
'first_anth_year',
'mean_anth_year',
'last_anth_year',
'text_count',
]
In [10]:
# data_traintest: texts where complete data (especially year_gt) is available
data_traintest = data.dropna(subset=possible_features + ['year_gt'])
print(data_traintest.shape[0])
data_traintest.head()
3395
Out[10]:
| author_title | author_birth_year | author_death_year | first_anth_year | mean_anth_year | last_anth_year | text_count | year_gt | |
|---|---|---|---|---|---|---|---|---|
| 2 | Kolmar, Gertrud – Marats Antlitz | 1894.0 | 1943.0 | 2022 | 2022.000000 | 2022 | 1 | 1934.0 |
| 3 | Wildenbruch, Ernst von – Dem Fürsten Bismarck | 1845.0 | 1909.0 | 1903 | 1930.166667 | 1981 | 6 | 1890.0 |
| 6 | Hagenbach, Karl Rudolf – Das Feuerzeichen | 1801.0 | 1874.0 | 1840 | 1864.875000 | 1891 | 8 | 1839.0 |
| 7 | Miegel, Agnes – Jane | 1874.0 | 1964.0 | 2022 | 2022.000000 | 2022 | 1 | 1905.0 |
| 9 | Blomberg, Hugo von – Des alten Dessauers Gebet | 1820.0 | 1871.0 | 1867 | 1885.000000 | 1898 | 4 | 1860.0 |
In [11]:
X = data_traintest[possible_features]
y = data_traintest['year_gt']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=True)
print(X_train.shape[0])
print(X_test.shape[0])
2716 679
In [12]:
kf = KFold(
n_splits=5,
shuffle=True,
random_state=1
)
Approach 1: ages mean¶
train/test¶
In [13]:
# cv
ages_means, mae, mse, r2 = [], [], [], []
for train_index, test_index in kf.split(data_traintest):
data_train = data_traintest.iloc[train_index].copy()
data_test = data_traintest.iloc[test_index].copy()
ages_train = (data_train['year_gt']-data_train['author_birth_year']).dropna().tolist()
ages_train_mean = np.mean(ages_train)
data_test['year_predict_ages_mean'] = data_test['author_birth_year'] + ages_train_mean
errors = data_test['year_predict_ages_mean'] - data_test['year_gt']
errors = abs(errors.dropna())
ages_means.append(ages_train_mean)
mae.append(errors.mean())
mse.append((errors*errors).mean())
r2.append(r2_score(data_test['year_gt'], data_test['year_predict_ages_mean']))
print(f"CV Ages Mean : {np.mean(ages_means)}")
print(f"CV Mean Absolute Error : {np.mean(mae)}")
print(f"CV Mean Squared Error : {np.mean(mse)}")
print(f"CV R2 : {np.mean(r2)}")
CV Ages Mean : 39.28718703976436 CV Mean Absolute Error : 9.898268700614482 CV Mean Squared Error : 147.47002896705504 CV R2 : 0.8340278398369471
In [14]:
# test set
ages_train = (y_train-X_train['author_birth_year']).dropna().tolist()
ages_train_mean = np.mean(ages_train)
y_predict = X_test['author_birth_year'] + ages_train_mean
errors = y_predict-y_test
errors = abs(errors.dropna())
print(f"Ages Mean : {ages_train_mean}")
print(f"Mean Absolute Error : {errors.mean()}")
print(f"Mean Squared Error : {(errors*errors).mean()}")
print(f"R2 : {r2_score(y_test, y_predict)}")
Ages Mean : 39.18262150220913 Mean Absolute Error : 9.825601193820066 Mean Squared Error : 148.14535149802308 R2 : 0.8172553603318362
predict¶
In [15]:
ages = (data['year_gt']-data['author_birth_year']).dropna().tolist()
ages_mean = np.mean(ages)
In [16]:
meta['year_predict_ages_mean'] = meta['author_birth'] + ages_mean
In [17]:
print(ages_mean)
print(meta.query('year_gt.notna()')['year_predict_ages_mean'].mean())
39.2524609148813 1872.4391506296618
Approach 2: year middle¶
In [18]:
def get_year_middle (element, min_age = 18):
try:
year_min = element['author_birth_year']+min_age
year_max = np.nanmin([element['author_death_year'], element['first_anth_year']])
year_middle = (year_min + year_max)/2
return year_middle
except:
return float('NaN')
train/test¶
In [19]:
# cv
mae, mse, r2 = [], [], []
for train_index, test_index in kf.split(data_traintest):
data_train = data_traintest.iloc[train_index].copy()
data_test = data_traintest.iloc[test_index].copy()
ages_train = (data_train['year_gt']-data_train['author_birth_year']).dropna().tolist()
ages_train_min = np.min(ages_train)
data_test['year_predict_middle'] = [get_year_middle(element, min_age = ages_train_min) for element in data_test.iloc]
errors = data_test['year_predict_middle'] - data_test['year_gt']
errors = abs(errors.dropna())
mae.append(errors.mean())
mse.append((errors*errors).mean())
r2.append(r2_score(data_test['year_gt'], data_test['year_predict_middle']))
print(f"CV Mean Absolute Error : {np.mean(mae)}")
print(f"CV Mean Squared Error : {np.mean(mse)}")
print(f"CV R2 : {np.mean(r2)}")
CV Mean Absolute Error : 9.270250368188513 CV Mean Squared Error : 128.98549337260678 CV R2 : 0.8545560243415633
In [20]:
# test set
ages_train = (y_train-X_train['author_birth_year']).dropna().tolist()
ages_train_min = np.min(ages_train)
y_predict = [get_year_middle(element, min_age = ages_train_min) for element in X_test.iloc]
errors = y_predict-y_test
errors = abs(errors.dropna())
print(f"Mean Absolute Error : {errors.mean()}")
print(f"Mean Squared Error : {(errors*errors).mean()}")
print(f"R2 : {r2_score(y_test, y_predict)}")
Mean Absolute Error : 9.66642120765832 Mean Squared Error : 137.92746686303389 R2 : 0.8298596278765852
predict¶
In [21]:
data_predict = data.dropna(subset=['author_birth_year', 'author_death_year', 'first_anth_year']).copy()
data_predict['year_predict'] = [get_year_middle(element) for element in data_predict.iloc]
data_predict = data_predict.reset_index(drop=True)
year_predict_dic = dict(zip(data_predict['author_title'], data_predict['year_predict']))
In [22]:
for i, element in enumerate(meta.iloc):
this_author_title = element['author_title']
if this_author_title in data_predict['author_title'].tolist():
meta.at[i, 'year_predict_middle'] = year_predict_dic[this_author_title]
Approach 3: machine learning¶
compare models¶
In [23]:
scoring_functions = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
def eval_model (model, features=possible_features):
results = pd.DataFrame()
scores = cross_validate(model, X[features], y, cv=kf, scoring=scoring_functions)
results.at['cv_complete_set', 'mean_absolute_error'] = np.mean(-scores['test_neg_mean_absolute_error'])
results.at['cv_complete_set', 'mean_squared_error'] = np.mean(-scores['test_neg_mean_squared_error'])
results.at['cv_complete_set', 'r2'] = np.mean(scores['test_r2'])
scores = cross_validate(model, X_train[features], y_train, cv=kf, scoring=scoring_functions)
results.at['cv_train_set', 'mean_absolute_error'] = np.mean(-scores['test_neg_mean_absolute_error'])
results.at['cv_train_set', 'mean_squared_error'] = np.mean(-scores['test_neg_mean_squared_error'])
results.at['cv_train_set', 'r2'] = np.mean(scores['test_r2'])
model.fit(X_train[features], y_train)
y_test_predict = model.predict(X_test[features])
results.at['test_set', 'mean_absolute_error'] = mean_absolute_error(y_test_predict, y_test)
results.at['test_set', 'mean_squared_error'] = mean_squared_error(y_test_predict, y_test)
results.at['test_set', 'r2'] = r2_score(y_test_predict, y_test)
return results
In [24]:
def get_errors (model, features=possible_features, mode='cv_complete_set'):
if mode == 'cv_complete_set':
y_predict = cross_val_predict(model, X[features], y, cv=kf)
errors = np.array(y-y_predict)
elif mode == 'cv_train_set':
y_predict = cross_val_predict(model, X_train[features], y_train, cv=kf)
errors = np.array(y_train-y_predict)
elif mode == 'test_set':
model.fit(X_train[features], y_train)
y_predict = model.predict(X_test[features])
errors = np.array(y_test-y_predict)
return errors
def visualize_errors (errors):
fig = px.histogram(errors, labels={'value': 'error'})
fig.update_layout(showlegend=False)
fig.show()
errors_mean = np.mean(errors)
errors_mean_abs = np.mean(np.abs(errors))
print(f"mean_absolute_error : {round(errors_mean_abs, 4)}")
print(f"mean_error : {round(errors_mean, 4)}")
In [25]:
models = {
'linear_regressor' : LinearRegression(),
'ridge_regressor' : Ridge(),
'ard_regressor' : ARDRegression(),
'kernel_ridge_regressor' : KernelRidge(),
'kneighbors_regressor' : KNeighborsRegressor(),
'adaboost_regressor' : AdaBoostRegressor(random_state=42),
'bagging_regressor' : BaggingRegressor(random_state=42),
'gradient_boosting_regressor' : GradientBoostingRegressor(random_state=42),
'hist_gradient_boosting_regressor' : HistGradientBoostingRegressor(random_state=42),
'random_forest_regressor' : RandomForestRegressor(random_state=42),
'mlp_regressor' : MLPRegressor(random_state=42, max_iter=1000),
}
In [26]:
for model_name in tqdm(models):
results = eval_model(models[model_name])
print(f"\n\n{model_name}")
print(results)
0%| | 0/11 [00:00<?, ?it/s]
linear_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 9.060317 124.456057 0.860107
cv_train_set 9.041652 123.670930 0.862956
test_set 9.178702 127.921145 0.809713
ridge_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 9.060317 124.456054 0.860107
cv_train_set 9.041651 123.670922 0.862956
test_set 9.178705 127.921176 0.809713
ard_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 9.063494 124.434870 0.860128
cv_train_set 9.042844 123.615798 0.863013
test_set 9.182157 127.940928 0.809636
kernel_ridge_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 9.387051 131.611980 0.851872
cv_train_set 9.390244 131.541098 0.854253
test_set 9.439567 133.774931 0.832859
kneighbors_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 7.137378 105.023682 0.883156
cv_train_set 7.416666 113.395689 0.875121
test_set 7.430928 104.270987 0.854032
adaboost_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 8.792977 126.577010 0.858701
cv_train_set 8.736359 125.461120 0.861234
test_set 9.006384 126.371332 0.802456
bagging_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 6.180390 90.009922 0.899971
cv_train_set 6.214439 90.131272 0.900602
test_set 6.234904 85.580274 0.884559
gradient_boosting_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 7.037911 90.776419 0.898913
cv_train_set 7.059796 91.343437 0.899390
test_set 7.274541 90.095518 0.871088
hist_gradient_boosting_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 6.500676 85.286779 0.905082
cv_train_set 6.528512 87.581662 0.903757
test_set 6.828010 87.893407 0.880195
random_forest_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 6.028631 84.411729 0.906031
cv_train_set 6.115095 86.433683 0.904782
test_set 6.108529 82.338954 0.887943
mlp_regressor
mean_absolute_error mean_squared_error r2
cv_complete_set 9.870442 147.816524 0.834483
cv_train_set 10.016732 153.006094 0.831131
test_set 9.681776 138.974099 0.824130
In [27]:
random_forest_regressor = RandomForestRegressor(random_state=42)
eval_model(random_forest_regressor)
pd.DataFrame({'feature' : possible_features, 'importance' : random_forest_regressor.feature_importances_})
Out[27]:
| feature | importance | |
|---|---|---|
| 0 | author_birth_year | 0.862607 |
| 1 | author_death_year | 0.056364 |
| 2 | first_anth_year | 0.051383 |
| 3 | mean_anth_year | 0.013611 |
| 4 | last_anth_year | 0.010950 |
| 5 | text_count | 0.005086 |
In [28]:
errors = get_errors(random_forest_regressor, mode='cv_complete_set')
visualize_errors(errors)
mean_absolute_error : 6.0286 mean_error : -0.0307
test random states¶
In [29]:
random_forest_regressor = RandomForestRegressor(random_state=42)
In [30]:
results = pd.DataFrame()
for i in tqdm(range(30)):
kf_rs = KFold(n_splits=5, shuffle=True, random_state=i)
scores = cross_validate(random_forest_regressor, X, y, cv=kf_rs, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'))
results.at[i, 'cv_mae'] = np.mean(-scores['test_neg_mean_absolute_error'])
results.at[i, 'cv_mse'] = np.mean(-scores['test_neg_mean_squared_error'])
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X, y, test_size=0.2, random_state=i, shuffle=True)
random_forest_regressor.fit(X_train_rs, y_train_rs)
y_test_predict = random_forest_regressor.predict(X_test_rs)
results.at[i, 'mae'] = mean_absolute_error(y_test_predict, y_test_rs)
results.at[i, 'mse'] = mean_squared_error(y_test_predict, y_test_rs)
0%| | 0/30 [00:00<?, ?it/s]
In [31]:
px.box(results, y = ['cv_mae', 'mae'], points='all', labels={'variable':'', 'value':''}, hover_name=results.index).show()
px.box(results, y = ['cv_mse', 'mse'], points='all', labels={'variable':'', 'value':''}, hover_name=results.index).show()
improve model¶
https://www.kaggle.com/code/marcinrutecki/gridsearchcv-kfold-cv-the-right-way
use only training set for tuning, evaluate on test set later
In [32]:
preferred_scoring = 'mean_squared_error'
get best features¶
(exhaustive search)
In [33]:
def all_combinations(items):
result = []
for r in range(1, len(items) + 1):
combinations = itertools.combinations(items, r)
result.extend(combinations)
return result
possible_feature_combinations = all_combinations(possible_features)
possible_feature_combinations = [list(combination) for combination in possible_feature_combinations]
In [34]:
feature_combinations_scores = pd.DataFrame()
for i, feature_combination in enumerate(tqdm(possible_feature_combinations)):
scores = cross_validate(
random_forest_regressor,
X_train[feature_combination],
y_train,
cv=kf,
scoring=scoring_functions,
)
mae = np.mean(-scores['test_neg_mean_absolute_error'])
mse = np.mean(-scores['test_neg_mean_squared_error'])
r2 = np.mean(scores['test_r2'])
for feature in possible_features:
feature_combinations_scores.at[i, feature] = 1 if feature in feature_combination else 0
feature_combinations_scores.at[i, 'mean_absolute_error'] = mae
feature_combinations_scores.at[i, 'mean_squared_error'] = mse
feature_combinations_scores.at[i, 'r2'] = r2
0%| | 0/63 [00:00<?, ?it/s]
In [35]:
best_feature_combination = feature_combinations_scores.sort_values(by=preferred_scoring).head(1)
best_feature_combination = best_feature_combination.columns[best_feature_combination.iloc[0] == 1].tolist()
In [36]:
pd.concat([
feature_combinations_scores.sort_values(by=preferred_scoring).head(5),
feature_combinations_scores.sort_values(by=preferred_scoring).tail(5)
])
Out[36]:
| author_birth_year | author_death_year | first_anth_year | mean_anth_year | last_anth_year | text_count | mean_absolute_error | mean_squared_error | r2 | |
|---|---|---|---|---|---|---|---|---|---|
| 42 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 6.053746 | 85.468251 | 0.905770 |
| 58 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 6.084587 | 85.804644 | 0.905456 |
| 62 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 6.115095 | 86.433683 | 0.904782 |
| 56 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 6.106740 | 86.580804 | 0.904561 |
| 21 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 5.984260 | 86.834208 | 0.904350 |
| 19 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 15.004678 | 415.485917 | 0.542472 |
| 20 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 15.502078 | 440.388709 | 0.513857 |
| 3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 15.889946 | 468.937691 | 0.482761 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 17.128785 | 537.446106 | 0.406882 |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 23.111350 | 811.555413 | 0.108981 |
get best params¶
In [37]:
param_grid = {
'n_estimators': [100, 400, 800, 1000],
'max_depth': [10, 20, 30],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2],
}
grid_search = GridSearchCV(
estimator=random_forest_regressor,
param_grid=param_grid,
cv=kf,
scoring='neg_'+preferred_scoring,
n_jobs=-1,
verbose=1
)
grid_search.fit(
X_train[best_feature_combination],
y_train
)
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Out[37]:
GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
param_grid={'max_depth': [10, 20, 30], 'min_samples_leaf': [1, 2],
'min_samples_split': [2, 5],
'n_estimators': [100, 400, 800, 1000]},
scoring='neg_mean_squared_error', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
param_grid={'max_depth': [10, 20, 30], 'min_samples_leaf': [1, 2],
'min_samples_split': [2, 5],
'n_estimators': [100, 400, 800, 1000]},
scoring='neg_mean_squared_error', verbose=1)RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=400,
random_state=42)RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=400,
random_state=42)In [38]:
best_params = grid_search.best_params_
best_score = -grid_search.best_score_
print("Feature importance :", grid_search.best_estimator_.feature_importances_)
print("Best CV score :", best_score)
Feature importance : [0.88777482 0.05129049 0.04988355 0.01105114] Best CV score : 83.0962694685561
In [39]:
print(f"best feature combination : {best_feature_combination}")
print(f"best parameters : {best_params}")
best feature combination : ['author_birth_year', 'author_death_year', 'first_anth_year', 'last_anth_year']
best parameters : {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 400}
eval improved model¶
In [40]:
# only results for test set are informative
best_random_forest_regressor = RandomForestRegressor(**best_params, random_state=42)
eval_model(best_random_forest_regressor, features=best_feature_combination)
Out[40]:
| mean_absolute_error | mean_squared_error | r2 | |
|---|---|---|---|
| cv_complete_set | 6.226239 | 81.173910 | 0.909564 |
| cv_train_set | 6.271163 | 83.096269 | 0.908548 |
| test_set | 6.403285 | 81.055716 | 0.887285 |
predict¶
In [41]:
# vanilla
final_feature_combination = possible_features
final_params = RandomForestRegressor(random_state=42).get_params()
X_train_final = X
y_train_final = y
# improved
# final_feature_combination = best_feature_combination
# final_params = best_params
# final_params.update({"random_state":42})
# X_train_final = X_train
# y_train_final = y_train
In [42]:
# create model
random_forest_regressor = RandomForestRegressor(**final_params)
random_forest_regressor.fit(X_train_final[final_feature_combination], y_train_final)
Out[42]:
RandomForestRegressor(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(random_state=42)
In [43]:
# apply model to data
data_predict = data.dropna(subset=final_feature_combination).copy()
data_predict['year_predict'] = random_forest_regressor.predict(data_predict[final_feature_combination])
data_predict = data_predict.reset_index(drop=True)
year_predict_dic = dict(zip(data_predict['author_title'], data_predict['year_predict']))
In [44]:
for i, element in enumerate(meta.iloc):
this_author_title = element['author_title']
if this_author_title in data_predict['author_title'].tolist():
meta.at[i, 'year_predict_rfr'] = year_predict_dic[this_author_title]
Combine Predictions¶
In [45]:
rfr_available = meta.query("year_predict_rfr.notna()").shape[0]
middle_available = meta.query("year_predict_middle.notna()").shape[0]
ages_mean_available = meta.query("year_predict_ages_mean.notna()").shape[0]
print(f"all texts : {meta.shape[0]}\n")
print(f"Available Predictions")
print(f"random forest : {rfr_available}")
print(f"year middle : {middle_available}")
print(f"ages mean : {ages_mean_available}")
all texts : 21303 Available Predictions random forest : 18148 year middle : 18148 ages mean : 18428
In [46]:
earliest_anthology_year_dict = meta.groupby('author_title')['anthology_year_used_ed'].min().to_dict()
year_gt_filled = meta.groupby('author_title')['year_gt'].transform(lambda x: x.ffill().bfill())
for i, element in enumerate(meta.iloc):
final_year = float('NaN')
this_author_title = element['author_title']
this_year_gt = year_gt_filled.iloc[i]
this_year_predict_rfr = element['year_predict_rfr']
this_year_predict_ages_mean = element['year_predict_ages_mean']
this_birth_year = element['author_birth']
this_death_year = element['author_death']
this_earliest_anthology_year = earliest_anthology_year_dict[this_author_title]
if element.id == '1912.Werner.150':
print()
if pd.notna(this_year_gt):
final_year = this_year_gt
elif pd.notna(this_year_predict_rfr):
final_year = round(this_year_predict_rfr)
elif pd.notna(this_year_predict_ages_mean):
final_year = round(this_year_predict_ages_mean)
# check (nicht vor Geburtsdatum, nicht nach Tod, nicht nach erster Anthologie)
this_min_possible = this_birth_year
this_max_possible = np.nanmin([this_death_year, this_earliest_anthology_year])
if final_year < this_min_possible:
final_year = this_min_possible
elif final_year > this_max_possible:
final_year = this_max_possible
meta.at[i, 'year'] = final_year
Check¶
In [47]:
print("year_gt")
print(f"year_gt (count) : {meta['year_gt'].dropna().shape[0]}")
print(f"year_gt (mean) : {meta['year_gt'].mean()}")
print(f"ages (mean) : {(meta['year_gt'].dropna()-meta['author_birth'].dropna()).mean()}")
print("\nages_mean")
print(f"year_predict (mean) : {meta.query('year_gt.notna()')['year_predict_ages_mean'].mean()}")
print(f"Mean Absolute Error : {np.mean(abs(meta['year_predict_ages_mean']-meta['year_gt']))}")
print(f"Mean Squared Error : {np.mean(abs(meta['year_predict_ages_mean']-meta['year_gt'])**2)}")
print("\nrandom forest")
print(f"year_predict (mean) : {meta.query('year_gt.notna()')['year_predict_rfr'].mean()}")
print(f"Mean Absolute Error : {np.mean(abs(meta['year_predict_rfr']-meta['year_gt']))}")
print(f"Mean Squared Error : {np.mean(abs(meta['year_predict_rfr']-meta['year_gt'])**2)}")
year_gt year_gt (count) : 3507 year_gt (mean) : 1871.9709153122326 ages (mean) : 39.2149236531259 ages_mean year_predict (mean) : 1872.4391506296618 Mean Absolute Error : 9.896605791689172 Mean Squared Error : 147.65328658988872 random forest year_predict (mean) : 1872.3506071709696 Mean Absolute Error : 3.1869898798383254 Mean Squared Error : 27.88486060124036
In [48]:
# Fehlende Datierungen?
pd.set_option('display.width', 1000)
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']
results_searched = (
meta
.query("1850 <= year <= 1918")
.query("corpus == 'anth' or author in @modcanon_authors or author in @muench_authors")
.query("year_search_status != 'not_searched'")
.sort_values(by = 'year')
)
results_notsearched = (
meta
.query("1850 <= year <= 1918")
.query("corpus == 'anth' or author in @modcanon_authors or author in @muench_authors")
.query("year_search_status == 'not_searched'")
.drop_duplicates(subset='author_title')
.sort_values(by = 'year')
)
results_notsearched = results_notsearched[~results_notsearched['author_title'].isin(results_searched['author_title'])]
if results_notsearched.shape[0] == 0:
print('Keine fehlenden Datierungen')
else:
print('Fehlende Datierungen:\n')
print(results_notsearched[['id', 'author', 'title', 'year', 'year_gt', 'corpus']])
test_ids = ['1892/93.Tetzner.1.067', '1891.Brümmer.521']
test_results = meta.query("id.isin(@test_ids)")
print("\nTests:")
print(test_results[['id', 'author', 'title', 'year', 'year_gt', 'year_predict_ages_mean', 'year_predict_rfr']])
Keine fehlenden Datierungen
Tests:
id author title year year_gt year_predict_ages_mean year_predict_rfr
13395 1891.Brümmer.521 Döring, Moritz Die weiße Kuh von Courcelles 1850.0 NaN 1837.252461 1849.705
13789 1892/93.Tetzner.1.067 Dahn, Felix Gotenzug 1876.0 1876.0 1873.252461 1875.150